Read & tidy the data

Load the required packages

library(tidyverse)
library(cowplot)
library(maps)
library(ggmap)
library(viridis)
library(plotly)
library(caret) #confusion matrix
library(e1071)
library(class) #knn.cv

Import the data

breweries = read_csv('https://raw.githubusercontent.com/BivinSadler/MSDS_6306_Doing-Data-Science/Master/Unit%208%20and%209%20Case%20Study%201/Breweries.csv')
beer = read_csv('https://raw.githubusercontent.com/BivinSadler/MSDS_6306_Doing-Data-Science/Master/Unit%208%20and%209%20Case%20Study%201/Beers.csv')

Breweries data has 558 observations and four columns:

  1. Brew_ID: Unique identifier of the brewery
  2. Name: Name of the brewery
  3. City: City where brewery is located
  4. State: U.S. State where brewery is located
dim(breweries)
## [1] 558   4
spec(breweries)
## cols(
##   Brew_ID = col_double(),
##   Name = col_character(),
##   City = col_character(),
##   State = col_character()
## )

Beer data has 2410 observations and seven columns:

  1. Name: Name of the beer
  2. Beer_ID: Unique identifier of the beer
  3. ABV: Alcohol by volume of the beer
  4. IBU; International Bitterness Units of the beer
  5. Brewery_id: Brewery identifier associated with the beer
  6. Style: Style of the beer
  7. Ounces: Ounces of the beer
dim(beer)
## [1] 2410    7
spec(beer)
## cols(
##   Name = col_character(),
##   Beer_ID = col_double(),
##   ABV = col_double(),
##   IBU = col_double(),
##   Brewery_id = col_double(),
##   Style = col_character(),
##   Ounces = col_double()
## )

Merge datasets by primary key

df = merge(breweries, beer, by.x = 'Brew_ID', by.y = 'Brewery_id')
head(df)

Rename Name.x and Name.y

df = rename(df, Brewery = Name.x, Beer = Name.y)
head(df)

Clean the data:

  1. Check the sum of NA values in each column
  2. Create new dataframe excluding the NA values (but also keep original dataframe)
  3. Confirm there are zero NA values left
cbind(lapply(lapply(df, is.na), sum))
##         [,1]
## Brew_ID 0   
## Brewery 0   
## City    0   
## State   0   
## Beer    0   
## Beer_ID 0   
## ABV     62  
## IBU     1005
## Style   5   
## Ounces  0
final = df %>% na.exclude()
sum(as.numeric(is.na.data.frame(final)))
## [1] 0

Study the dataset

Final dataset has 1403 observations and 10 columns:
1. Brew_ID: Unique identifier of the brewery
2. Brewery: Name of the brewery
3. City: City where brewery is located
4. State: U.S. state where brewery is located
5. Beer: Name of the beer
6. Beer_ID: Unique identifier of the beer
7. ABV: Alcohol by volume of the beer
8. IBU: International Bitterness Units of the beer
9. Ounces: Ounces of the beer
10. Style: Style of the beer

dim(final)
## [1] 1403   10
str(final)
## 'data.frame':    1403 obs. of  10 variables:
##  $ Brew_ID: num  1 1 1 1 1 1 2 2 2 2 ...
##  $ Brewery: chr  "NorthGate Brewing" "NorthGate Brewing" "NorthGate Brewing" "NorthGate Brewing" ...
##  $ City   : chr  "Minneapolis" "Minneapolis" "Minneapolis" "Minneapolis" ...
##  $ State  : chr  "MN" "MN" "MN" "MN" ...
##  $ Beer   : chr  "Pumpion" "Stronghold" "Parapet ESB" "Get Together" ...
##  $ Beer_ID: num  2689 2688 2687 2692 2691 ...
##  $ ABV    : num  0.06 0.06 0.056 0.045 0.049 0.048 0.042 0.08 0.125 0.077 ...
##  $ IBU    : num  38 25 47 50 26 19 42 68 80 25 ...
##  $ Style  : chr  "Pumpkin Ale" "American Porter" "Extra Special / Strong Bitter (ESB)" "American IPA" ...
##  $ Ounces : num  16 16 16 16 16 16 16 16 16 16 ...
##  - attr(*, "na.action")= 'exclude' Named int [1:1007] 19 35 36 37 38 39 40 41 42 43 ...
##   ..- attr(*, "names")= chr [1:1007] "19" "35" "36" "37" ...

The first six and last rows of the dataframe

head(final)
tail(final)

Exploratory Data Analysis

How many breweries are in each state?

breweries %>% group_by(State) %>% tally(sort=T)

Plot into a heatmap

states = map_data('state')
st_abb = data.frame(abb = state.abb, region = tolower(state.name))
brew = breweries %>% group_by(State) %>% tally(sort=T)
brew = subset(brew, State != 'HI' & State != 'AK') 
brew = rename(brew, abb = State, count = n)
brew = merge(brew, st_abb, by='abb')
geo = merge(states, brew, by='region', all.x=T)
geo = geo[order(geo$order),]
center <- data.frame(region=tolower(state.name), long=state.center$x, lat=state.center$y)
center <- merge(center, brew, by="region", all.x = TRUE)
ggplot(geo, aes(x=long,y=lat)) +
geom_polygon(aes(group=group, fill=count)) +
geom_text(data=center, aes(long, lat, label=count)) +
scale_fill_gradient(low = "slategray1",
high = "royalblue4",
guide = "colorbar") +
ggtitle("Breweries per State") +
coord_map() + theme(axis.title = element_text(size = 20)) + theme_void() + theme(legend.position = c(0.9, 0.4), plot.title = element_text(size= 20, hjust=0.1, margin = margin(b = -0.1, t = 0.4, l = 2, unit = "cm")))

Compute median ABV for each state

median = final %>% select(State, ABV, IBU) %>% group_by(State) %>% summarize(med_abv = median(ABV), med_ibu = median(IBU))
arrange(median, -med_abv)
arrange(median, -med_ibu)
median %>% mutate(perc = med_abv*100, State = fct_reorder(State, perc)) %>% ggplot(aes(State, perc)) + geom_point(size=4, color="royalblue2") + coord_flip() + labs(y='ABV (%)', title='Median Alcohol by Volume of each State', subtitle = 'Decreasing Order') + theme_minimal() + theme(plot.title = element_text(size= 20), axis.title = element_text(size = 20))

Compute median IBU for each state

arrange(median, -med_ibu)
# no geom_segment
median %>% mutate(State = fct_reorder(State, med_ibu)) %>% ggplot(aes(State, med_ibu)) + geom_point(size=4, color="indianred2") + coord_flip() + labs(y='IBU', title='Median International Bitterness Unit of each State', subtitle = 'Decreasing Order') + theme_minimal() + theme(plot.title = element_text(size= 20), axis.title = element_text(size = 20))

Kentucky has the max ABV

max(final$ABV)
## [1] 0.125
grep(.125, final$ABV)
## [1] 9
final$State[9]
## [1] "KY"

Oregon has the highest IBU

max(final$IBU)
## [1] 138
grep(138, final$IBU)
## [1] 1132
final$State[1132]
## [1] "OR"

Summary statistics and distribution of ABV

Statistic Percent
Min 2.7
Med 5.7
Mean 6
Max 12.5
IQR 1.8

Right skew distribution suggests lower production at increasing ABV. Consumer preference (and thus sales) drives demand to infer most lean towards lower concentration of ABV. The median value of 5.7% is a better indicator of the distribution because the mean is pulled towards the skewed tail.

summary(final$ABV)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.02700 0.05000 0.05700 0.05992 0.06800 0.12500
fivenum(final$ABV)
## [1] 0.027 0.050 0.057 0.068 0.125
IQR(final$ABV)
## [1] 0.018
final %>% mutate(abvperc = ABV*100) %>% ggplot(aes(abvperc)) + geom_boxplot(fill='royalblue', alpha=.3) + labs(x='ABV (%)', title='Distribution of Alcohol by Volume') + theme_cowplot() + theme(plot.title = element_text(size= 20), axis.ticks.y=element_blank(), axis.title.y=element_blank(), axis.text.y=element_blank(), legend.position = 'none', axis.title = element_text(size = 20)) + geom_segment(aes(x = 6, y = -.05, xend = 6, yend = .05, col='red'))

# density plot
final %>% mutate(abvperc = ABV*100) %>% ggplot(aes(abvperc)) + geom_density(fill='indianred2', color='indianred3', alpha=.9) + geom_vline(xintercept=5.7, col='cornflowerblue') + geom_vline(xintercept = 6, linetype='dotted', col='paleturquoise3', size=1.1) + labs(x='ABV (%)', title='Distribution of Alcohol by Volume') + theme_minimal_hgrid() + theme(axis.title = element_text(size = 20), plot.title = element_text(size= 20))

Relationship between ABV and IBU using polynomial regression model

The ABV and IBU values were log transformed. The linear regression line is depicted in green whereas polynomial regression line is in red. The latter model better fits the data however both are pretty similar.

final2 = final %>% mutate(log_abv = log(ABV), log_ibu = log(IBU))
x = final2$log_abv
y = final2$log_ibu
model3 = lm(y~x)
new_x = cbind(x, x^2)
model4 = lm(y~new_x)
ggplot(data = final2) + geom_point(aes(x = log_abv,y = log_ibu)) +
    geom_line(aes(x = log_abv,y = model3$fit), color = "blue", lwd=.8) +
    geom_line(aes(x = log_abv,y = model4$fit), color = "red", lwd=.8) + 
    labs(title='Relationship between Bitterness of Beer and its Alcohol Content', subtitle = 'Linear & Polynomial Regression Model on Log Transformed Data', x= 'log(ABV)', y='log(IBU)') + theme_classic()

new calculation

final = final %>% mutate(log_abv = log(ABV))
cor.test(x=final$log_abv, y=final$IBU)
## 
##  Pearson's product-moment correlation
## 
## data:  final$log_abv and final$IBU
## t = 34.032, df = 1401, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6430317 0.7004025
## sample estimates:
##       cor 
## 0.6727271
model = lm(IBU~log_abv, data=final)
summary(model)
## 
## Call:
## lm(formula = IBU ~ log_abv, data = final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -74.393 -12.410  -0.622  12.989  91.579 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  272.589      6.773   40.24   <2e-16 ***
## log_abv       80.972      2.379   34.03   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.22 on 1401 degrees of freedom
## Multiple R-squared:  0.4526, Adjusted R-squared:  0.4522 
## F-statistic:  1158 on 1 and 1401 DF,  p-value: < 2.2e-16

new ibu abv plot on logged abv

final %>% ggplot(aes(x=log_abv, y=IBU)) + geom_point(color="black", fill="#69b3a2", shape=22, alpha=0.5, size=3, stroke = 1) + geom_line(aes(x = log_abv,y = model$fit), color = "lightpink3", lwd=.8) + labs(title='Relationship between Bitterness of Beer and its Alcohol Content', subtitle = 'Linear Regression Model on Log Transformed ABV', x= 'ABV (logged)', y='IBU') + theme_classic() + theme(axis.title = element_text(size = 20), plot.title = element_text(size= 20))

ABV vs. IBU by Style

View all 90 different styles

final %>% group_by(Style) %>% tally(sort=T)

Style df of just IPA/ale styles. 32 different styles of IPA and ale encompassing 944 observations and 10 columns

style = final %>% group_by(Style) %>% filter(grepl('\\sAle|IPA$', Style))
dim(style)
## [1] 944  11

Distinguish ale from ipa

all_style = final %>% group_by(Style)

keep = grep('\\sAle', all_style$Style)
ale = all_style[keep,]
rem = grep('India', ale$Style)
ale = ale[-rem,] #552 obs, 27 ale styles
ale$Style = 'ale'

ipa = grep(('\\sIndia\\sPale\\sAle|IPA$'), all_style$Style)
ipa = all_style[ipa,] # 392 obs, 5 ipa styles
ipa$Style = 'ipa'

ipa_ale = rbind(ipa, ale)
ipa_ale$Style = as.factor(ipa_ale$Style)

Code the plots

pmain = ipa_ale %>% ggplot(aes(x=ABV, y=IBU, shape=factor(Style))) + geom_point(aes(color=factor(Style))) + theme(legend.position = 'none', axis.title = element_text(size = 20), panel.background = element_rect(fill = "gray97", color = NA))

xdens = axis_canvas(pmain, axis='x') + geom_density(data=ipa_ale, aes(ABV, fill=Style), alpha=.7, size=.2)
ydens = axis_canvas(pmain, axis='y', coord_flip = TRUE) + geom_density(data=ipa_ale, aes(IBU, fill=Style), alpha=.7, size=.2) + coord_flip()
p1 = insert_xaxis_grob(pmain, xdens, grid::unit(.2, 'null'), position='top')
p2 = insert_yaxis_grob(p1, ydens, grid::unit(.2, 'null'), position='right')

Plot scatterplot with marginal plot on the axis

ggdraw(p2) + draw_label('ABV vs. IBU by Style', x=.12, y=.98, size=20) + draw_label('IPA', x=.52, y=.95, size=13, color='paleturquoise4', fontface = 'bold') + draw_label('Ale', x=.35, y=.97, size=13, color='rosybrown', fontface = 'bold')

Conduct KNN classifier to investigate the difference with respect to ABV and IBU between IPAs and other types of Ale

normalize = function(x) {
return ((x - min(x)) / (max(x) - min(x)))
                        }
ipa_ale = ipa_ale %>% mutate(norm_abv = normalize(ABV), norm_ibu = normalize(IBU))

model = knn.cv(ipa_ale[,c(12:13)], ipa_ale$Style, k=11)
confusionMatrix(model, ipa_ale$Style)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction ale ipa
##        ale 460 117
##        ipa  92 275
##                                           
##                Accuracy : 0.7786          
##                  95% CI : (0.7507, 0.8047)
##     No Information Rate : 0.5847          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.5399          
##                                           
##  Mcnemar's Test P-Value : 0.09689         
##                                           
##             Sensitivity : 0.8333          
##             Specificity : 0.7015          
##          Pos Pred Value : 0.7972          
##          Neg Pred Value : 0.7493          
##              Prevalence : 0.5847          
##          Detection Rate : 0.4873          
##    Detection Prevalence : 0.6112          
##       Balanced Accuracy : 0.7674          
##                                           
##        'Positive' Class : ale             
## 

Conduct t.test

ipa_ale = ipa_ale %>% mutate(logabv=log(ABV), logibu=log(IBU))
t.test(logabv~Style, data=ipa_ale)
## 
##  Welch Two Sample t-test
## 
## data:  logabv by Style
## t = -16.91, df = 849.95, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group ale and group ipa is not equal to 0
## 95 percent confidence interval:
##  -0.2260548 -0.1790364
## sample estimates:
## mean in group ale mean in group ipa 
##         -2.889941         -2.687395
t.test(logibu~Style, data=ipa_ale)
## 
##  Welch Two Sample t-test
## 
## data:  logibu by Style
## t = -30.993, df = 875.42, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group ale and group ipa is not equal to 0
## 95 percent confidence interval:
##  -0.8911237 -0.7849819
## sample estimates:
## mean in group ale mean in group ipa 
##          3.399569          4.237622

INSIGHT

IBU distribution on US map (static plot)

states = map_data('state')
usabeer = final %>% select(City, State, ABV, IBU)
usabeer = subset(usabeer, State != 'HI' & State != 'AK')
usabeer$citystate = str_c(usabeer$City, ", ", usabeer$State)
usabeer = cbind(geocode(as.character(usabeer$citystate)), usabeer)
usabeer = usabeer %>% arrange(IBU) %>% mutate(cs=factor(citystate, unique(citystate)))

ggplot() + geom_polygon(data=states, aes(x=long, y=lat, group=group), fill='grey', alpha=.3) + geom_point(data=usabeer, aes(lon, y=lat, size=IBU, color=IBU, alpha=IBU), shape=20) + scale_size_continuous(name='IBU', range=c(1,10), breaks=c(1, 25, 50, 75, 100)) + scale_alpha_continuous(name='IBU', range=c(.1, .9), breaks=c(1, 25, 50, 75, 100)) + scale_color_viridis(option='magma', direction=-1, breaks=c(1, 25, 50, 75, 100), name='IBU') + theme_void() + coord_map() + borders('state') + guides( colour = guide_legend()) +
ggtitle("Distribution of International Bitterness Unit by Cities") +
theme(
legend.position = c(0.85, 0.25),
text = element_text(color = "#22211d"),
plot.background = element_rect(fill = "#f5f5f2", color = NA),
panel.background = element_rect(fill = "#f5f5f2", color = NA),
legend.background = element_rect(fill = "#f5f5f2", color = NA),
plot.title = element_text(size= 20, hjust=0.1, margin = margin(b = -.5, t = 0.4, l = 2, unit = "cm")),
)

Plot IBU distribution

summary(final$IBU)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4.00   21.00   35.00   42.74   64.00  138.00
usabeer %>% ggplot(aes(IBU)) + geom_density(fill='royalblue', alpha=.7) + geom_vline(xintercept=35, col='red') + geom_vline(xintercept = 42.74, linetype='dotted', col='turquoise', size=1.1) + labs(title='International Bitterness Unit') + theme_minimal_hgrid() + theme(axis.title = element_text(size = 20), plot.title = element_text(size= 20))

IBU distribution on US map (PLOTLY)

usabeer = usabeer %>% mutate( mytext=paste(citystate, "\n", "IBU: ", IBU, sep=""))

p.usa = ggplot() + geom_polygon(data=states, aes(x=long, y=lat, group=group)) + geom_point(data=usabeer, aes(x=lon, y=lat, size=IBU, color=IBU, alpha=IBU, text=mytext)) + scale_size_continuous(range=c(1,7)) + scale_color_viridis(option='inferno', direction = -1) + theme_void() + coord_map() + borders('state', fill='whitesmoke', alpha=.3) + labs(title='Distribution of International Bitterness Unit by Cities') + theme(legend.position = 'none', plot.title = element_text(size=25, hjust=.5))
p.usa = ggplotly(p.usa, tooltip='text')
p.usa

Number of breweries per city in descending order

nbrew = final %>% select(IBU, City, State) %>% group_by(City, State) %>% tally(sort=T)
nbrew

MAX IBU for each city

maxibu = final %>% select(IBU, City, State) %>% group_by(State, City) %>% slice(which.max(IBU))
maxibu = arrange(maxibu, -IBU)
maxibu

merge, conduct cor test

ibu.brew.cor = as.data.frame(cbind(maxibu$IBU, nbrew$n))
ibu.brew.cor = rename(ibu.brew.cor, max_ibu=V1, n_brew = V2)
cor(ibu.brew.cor) # .7939
##           max_ibu    n_brew
## max_ibu 1.0000000 0.7938968
## n_brew  0.7938968 1.0000000
cor.test(x=ibu.brew.cor$n_brew, y=ibu.brew.cor$max_ibu, method = 'pearson')
## 
##  Pearson's product-moment correlation
## 
## data:  ibu.brew.cor$n_brew and ibu.brew.cor$max_ibu
## t = 22.196, df = 289, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7471146 0.8328525
## sample estimates:
##       cor 
## 0.7938968

Boston, MA IBU statistics. Boston is ranked 12th in breweries per city yet its maximum IBU is only 45. That’s 50% less than the max IBU compared to cities with similar demographics. In fact, if you take the max IBU of all the cities in MA and extract the median, Boston is still 30 IBU lower. I believe that Boston is an untapped market with a need for higher IBU beer production.

ma = maxibu %>% select(State, IBU) %>% filter(State == 'MA') %>% summarize(value = IBU)
## Adding missing grouping variables: `City`
## `summarise()` has grouped output by 'State'. You can override using the `.groups` argument.
mean(ma$value)
## [1] 68.09091
fivenum(ma$value)
## [1]  35.0  45.0  69.0  82.5 130.0